
# Story analysis
# Load the article-level dataset; the file is semicolon-delimited.
articles_path <- "NER - Articles Dataset - 2018 Final.csv"
arts <- read.csv(file = articles_path, sep = ";")

# Histogram of publication times relative to the initial article
# (this is the version we used).
library(ggplot2)

# Keep only rows with a strictly positive time_after_initial; which() drops
# NA comparisons, exactly as subset() does.
subarts <- arts[which(arts$time_after_initial > 0), , drop = FALSE]

story_hist <- ggplot(subarts, aes(x = time_after_initial)) +
  geom_histogram(binwidth = 1) +
  # One x-axis tick every 24 units, from 0 up to 12 * 24.
  scale_x_continuous(breaks = seq(0, 12 * 24, by = 24)) +
  theme_bw() +
  # Hide the y-axis labels and ticks; only the shape of the histogram matters.
  theme(axis.text.y = element_blank(), axis.ticks.y = element_blank()) +
  ylab("") +
  xlab("Number of hours after initial publication")

# Display the plot, then write that same plot to disk.
story_hist
ggsave("Average Story Evolution Over Time.png", plot = story_hist)

# How many articles are part of a story (story.size greater than 1)?
total_articles <- nrow(arts)
in_story <- arts$story.size > 1
subarts <- arts[which(in_story), , drop = FALSE]
n_in_story <- nrow(subarts)
n_in_story
n_in_story / total_articles

# Same share, using the story size within the article's own paper.
in_paper_story <- arts$story.size.paper > 1
subarts <- arts[which(in_paper_story), , drop = FALSE]
nrow(subarts) / total_articles

# How many clusters? One row per distinct value of arts$story, with its
# article count in the Freq column.
story_counts <- table(arts$story)
df <- data.frame(story_counts)
nrow(df)

# Some of these are singletons, of course; count only clusters with more
# than one article.
sum(df$Freq > 1)


# How is cluster size distributed?
# Here we look only at 'single paper' clusters. There are more single-paper
# clusters than overall clusters because one overall cluster can contain
# several of them (for example, Boston is one big cluster with 7 paper
# clusters within it).
clusts <- read.csv("NER - Story Dataset - Single Paper.csv", sep = ";")

# Summarise the clusters selected by a logical mask.
#
# in_bin:  logical vector, one entry per row of `stories`; TRUE selects a row.
# stories: data frame with `num_articles` and `duration` columns.
#
# Returns a one-row data frame with the number of selected clusters, the
# number of articles they contain, their share of all articles in `stories`,
# and their mean duration divided by 24.
summarise_size_bin <- function(in_bin, stories) {
  # which() drops NA comparisons, matching the behaviour of subset().
  bin <- stories[which(in_bin), , drop = FALSE]
  bin_articles <- sum(bin$num_articles)
  data.frame(
    n_clusters = nrow(bin),
    n_articles = bin_articles,
    share_of_articles = bin_articles / sum(stories$num_articles),
    # NOTE(review): assumes `duration` is recorded in hours, so / 24 yields
    # days -- confirm against the dataset.
    mean_duration_days = mean(bin$duration) / 24
  )
}

# Size bins on num_articles, using the same predicates for every statistic so
# that each bin reports the full set of numbers.
n_articles <- clusts$num_articles
size_bins <- list(
  "<6"    = n_articles < 6,
  "6-10"  = n_articles >= 6 & n_articles <= 10,
  "11-20" = n_articles >= 11 & n_articles <= 20,
  "21-30" = n_articles >= 21 & n_articles <= 30,
  ">30"   = n_articles > 30
)

# One row per bin; row names are the bin labels.
size_summary <- do.call(
  rbind,
  lapply(size_bins, summarise_size_bin, stories = clusts)
)
size_summary

# Mean duration over all single-paper clusters, on the same / 24 scale.
mean(clusts$duration) / 24

